knitr::opts_chunk$set(echo = TRUE)
require("ggplot2")
## Loading required package: ggplot2
require ("plyr")
## Loading required package: plyr
require ("caTools")
## Loading required package: caTools
The secrets to getting ahead in this company:
1.Work for many years with several different companies before coming to this company. Your Total Time Worked will provide you with a higher job level and higher monthly income.
2.Change Managers frequently. The longer you stay with a manager the longer the time before you will receive you next promotion.
3.Aspire and work toward being a manager or director. These job roles make more money. If you stay with the company long enough and have many years of total work time before coming to the company, you should be able to obtain one of these position.
4.Do not work as a Sales Representative. This job role makes less money.
5. Don’t worry about job performance ratings. Everyone receives either an outstanding or an excellent rating. The ratings have not impact on your monthly income.
## Number of employees in the dataset:
## 1470
## Number of variables in the dataset:
## 35
## Number of employees who have left the company and the number if employees who remain:
## Left Freq
## No 1233
## Yes 237
## 3 Variables had no variation,that is, are constants and will not aid in the analysis
## These variables are removed from the dataset.
## Number of variables in the dataset, after removing constants:
## 32
## Count of field types in the dataset:
##
## factor integer
## 8 24
ggplot(data=as.data.frame(table(attrit$Department,attrit$JobRole,dnn=list("Department","JobRole"))), aes(x= reorder(JobRole,Freq),y=Freq, fill = Department)) +
geom_bar(stat="identity") + coord_flip() + ggtitle("Number of Employees per Job Role by Department") + xlab("") + ylab("")+theme(plot.title=element_text(hjust=0.5))+scale_fill_manual(values=alpha(c("steelblue4", "thistle4", "paleturquoise4")))
# Recode Managers
attrit$JobRole <-with(attrit, ifelse(attrit$JobRole == "Manager",paste(attrit$Department,attrit$JobRole), as.character(attrit$JobRole)))
attrit$JobRole <- as.factor(attrit$JobRole)
ggplot(data=as.data.frame(table(attrit$Department,attrit$JobRole,dnn=list("Department","JobRole"))), aes(x= reorder(JobRole,Freq),y=Freq, fill = Department)) +
geom_bar(stat="identity") + coord_flip() + ggtitle("Number of Employees per Job Role by Department \nafter recoding") + xlab("") + ylab("")+theme(plot.title=element_text(hjust=0.5))+scale_fill_manual(values=alpha(c("steelblue4", "thistle4", "paleturquoise4")))
#Recode the variable "JobRole" to a numeric value for modeling
attrit$JRCode[attrit$JobRole=="Healthcare Representative"] <- 1L
attrit$JRCode[attrit$JobRole=="Human Resources"] <- 2L
attrit$JRCode[attrit$JobRole=="Laboratory Technician"] <- 3L
attrit$JRCode[attrit$JobRole=="Manufacturing Director"] <- 4L
attrit$JRCode[attrit$JobRole=="Research Director"] <- 5L
attrit$JRCode[attrit$JobRole=="Sales Executive"] <- 6L
attrit$JRCode[attrit$JobRole=="Research Scientist"] <- 7L
attrit$JRCode[attrit$JobRole=="Sales Representative"] <- 8L
attrit$JRCode[attrit$JobRole=="Human Resources Manager"] <- 9L
attrit$JRCode[attrit$JobRole=="Research & Development Manager"] <- 10L
0.0008
## [1] 8e-04
#Create a dataframe of current employees only
noattrit <- attrit[attrit$Attrition=="No",]
noattrit$Attrit <- NULL
attrition<-read.csv('CaseStudy2reorder.csv', header = T)
#View(attrition)
head(attrition)
## X Attrition BusinessTravel Department EducationField
## 1 1 Yes Travel_Rarely Sales Life Sciences
## 2 2 No Travel_Frequently Research & Development Life Sciences
## 3 3 Yes Travel_Rarely Research & Development Other
## 4 4 No Travel_Frequently Research & Development Life Sciences
## 5 5 No Travel_Rarely Research & Development Medical
## 6 6 No Travel_Frequently Research & Development Life Sciences
## Gender JobRole OverTime MaritalStatus Age DailyRate
## 1 Female Sales Executive Yes Single 41 1102
## 2 Male Research Scientist No Married 49 279
## 3 Male Laboratory Technician Yes Single 37 1373
## 4 Female Research Scientist Yes Married 33 1392
## 5 Male Laboratory Technician No Married 27 591
## 6 Male Laboratory Technician No Single 32 1005
## DistanceFromHome Education EmployeeNumber EnvironmentSatisfaction
## 1 1 2 1 2
## 2 8 1 2 3
## 3 2 2 4 4
## 4 3 4 5 4
## 5 2 1 7 1
## 6 2 2 8 4
## HourlyRate JobInvolvement JobLevel JobSatisfaction MonthlyIncome
## 1 94 3 2 4 5993
## 2 61 2 2 2 5130
## 3 92 2 1 3 2090
## 4 56 3 1 3 2909
## 5 40 3 1 2 3468
## 6 79 3 1 4 3068
## MonthlyRate NumCompaniesWorked PercentSalaryHike PerformanceRating
## 1 19479 8 11 3
## 2 24907 1 23 4
## 3 2396 6 15 3
## 4 23159 1 11 3
## 5 16632 9 12 3
## 6 11864 0 13 3
## RelationshipSatisfaction StockOptionLevel TotalWorkingYears
## 1 1 0 8
## 2 4 1 10
## 3 2 0 7
## 4 3 0 8
## 5 4 1 6
## 6 3 0 8
## TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole
## 1 0 1 6 4
## 2 3 3 10 7
## 3 3 3 0 0
## 4 3 3 8 7
## 5 3 3 2 2
## 6 2 2 7 7
## YearsSinceLastPromotion YearsWithCurrManager
## 1 0 5
## 2 1 7
## 3 0 0
## 4 3 0
## 5 2 2
## 6 3 6
attrition<-attrition[,-1] # 1st col obsolete
str(attrition)
## 'data.frame': 1470 obs. of 32 variables:
## $ Attrition : Factor w/ 2 levels "No","Yes": 2 1 2 1 1 1 1 1 1 1 ...
## $ BusinessTravel : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 2 3 2 3 2 3 3 2 3 ...
## $ Department : Factor w/ 3 levels "Human Resources",..: 3 2 2 2 2 2 2 2 2 2 ...
## $ EducationField : Factor w/ 6 levels "Human Resources",..: 2 2 5 2 4 2 4 2 2 4 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 2 2 1 2 2 1 2 2 2 ...
## $ JobRole : Factor w/ 9 levels "Healthcare Representative",..: 8 7 3 7 3 3 3 3 5 1 ...
## $ OverTime : Factor w/ 2 levels "No","Yes": 2 1 2 2 1 1 2 1 1 1 ...
## $ MaritalStatus : Factor w/ 3 levels "Divorced","Married",..: 3 2 3 2 2 3 2 1 3 2 ...
## $ Age : int 41 49 37 33 27 32 59 30 38 36 ...
## $ DailyRate : int 1102 279 1373 1392 591 1005 1324 1358 216 1299 ...
## $ DistanceFromHome : int 1 8 2 3 2 2 3 24 23 27 ...
## $ Education : int 2 1 2 4 1 2 3 1 3 3 ...
## $ EmployeeNumber : int 1 2 4 5 7 8 10 11 12 13 ...
## $ EnvironmentSatisfaction : int 2 3 4 4 1 4 3 4 4 3 ...
## $ HourlyRate : int 94 61 92 56 40 79 81 67 44 94 ...
## $ JobInvolvement : int 3 2 2 3 3 3 4 3 2 3 ...
## $ JobLevel : int 2 2 1 1 1 1 1 1 3 2 ...
## $ JobSatisfaction : int 4 2 3 3 2 4 1 3 3 3 ...
## $ MonthlyIncome : int 5993 5130 2090 2909 3468 3068 2670 2693 9526 5237 ...
## $ MonthlyRate : int 19479 24907 2396 23159 16632 11864 9964 13335 8787 16577 ...
## $ NumCompaniesWorked : int 8 1 6 1 9 0 4 1 0 6 ...
## $ PercentSalaryHike : int 11 23 15 11 12 13 20 22 21 13 ...
## $ PerformanceRating : int 3 4 3 3 3 3 4 4 4 3 ...
## $ RelationshipSatisfaction: int 1 4 2 3 4 3 1 2 2 2 ...
## $ StockOptionLevel : int 0 1 0 0 1 0 3 1 0 2 ...
## $ TotalWorkingYears : int 8 10 7 8 6 8 12 1 10 17 ...
## $ TrainingTimesLastYear : int 0 3 3 3 3 2 3 2 2 3 ...
## $ WorkLifeBalance : int 1 3 3 3 3 2 2 3 3 2 ...
## $ YearsAtCompany : int 6 10 0 8 2 7 1 1 9 7 ...
## $ YearsInCurrentRole : int 4 7 0 7 2 7 0 0 7 7 ...
## $ YearsSinceLastPromotion : int 0 1 0 3 2 3 0 0 1 7 ...
## $ YearsWithCurrManager : int 5 7 0 0 2 6 0 0 8 7 ...
summary(attrition)
## Attrition BusinessTravel Department
## No :1233 Non-Travel : 150 Human Resources : 63
## Yes: 237 Travel_Frequently: 277 Research & Development:961
## Travel_Rarely :1043 Sales :446
##
##
##
##
## EducationField Gender JobRole
## Human Resources : 27 Female:588 Sales Executive :326
## Life Sciences :606 Male :882 Research Scientist :292
## Marketing :159 Laboratory Technician :259
## Medical :464 Manufacturing Director :145
## Other : 82 Healthcare Representative:131
## Technical Degree:132 Manager :102
## (Other) :215
## OverTime MaritalStatus Age DailyRate
## No :1054 Divorced:327 Min. :18.00 Min. : 102.0
## Yes: 416 Married :673 1st Qu.:30.00 1st Qu.: 465.0
## Single :470 Median :36.00 Median : 802.0
## Mean :36.92 Mean : 802.5
## 3rd Qu.:43.00 3rd Qu.:1157.0
## Max. :60.00 Max. :1499.0
##
## DistanceFromHome Education EmployeeNumber EnvironmentSatisfaction
## Min. : 1.000 Min. :1.000 Min. : 1.0 Min. :1.000
## 1st Qu.: 2.000 1st Qu.:2.000 1st Qu.: 491.2 1st Qu.:2.000
## Median : 7.000 Median :3.000 Median :1020.5 Median :3.000
## Mean : 9.193 Mean :2.913 Mean :1024.9 Mean :2.722
## 3rd Qu.:14.000 3rd Qu.:4.000 3rd Qu.:1555.8 3rd Qu.:4.000
## Max. :29.000 Max. :5.000 Max. :2068.0 Max. :4.000
##
## HourlyRate JobInvolvement JobLevel JobSatisfaction
## Min. : 30.00 Min. :1.00 Min. :1.000 Min. :1.000
## 1st Qu.: 48.00 1st Qu.:2.00 1st Qu.:1.000 1st Qu.:2.000
## Median : 66.00 Median :3.00 Median :2.000 Median :3.000
## Mean : 65.89 Mean :2.73 Mean :2.064 Mean :2.729
## 3rd Qu.: 83.75 3rd Qu.:3.00 3rd Qu.:3.000 3rd Qu.:4.000
## Max. :100.00 Max. :4.00 Max. :5.000 Max. :4.000
##
## MonthlyIncome MonthlyRate NumCompaniesWorked PercentSalaryHike
## Min. : 1009 Min. : 2094 Min. :0.000 Min. :11.00
## 1st Qu.: 2911 1st Qu.: 8047 1st Qu.:1.000 1st Qu.:12.00
## Median : 4919 Median :14236 Median :2.000 Median :14.00
## Mean : 6503 Mean :14313 Mean :2.693 Mean :15.21
## 3rd Qu.: 8379 3rd Qu.:20462 3rd Qu.:4.000 3rd Qu.:18.00
## Max. :19999 Max. :26999 Max. :9.000 Max. :25.00
##
## PerformanceRating RelationshipSatisfaction StockOptionLevel
## Min. :3.000 Min. :1.000 Min. :0.0000
## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:0.0000
## Median :3.000 Median :3.000 Median :1.0000
## Mean :3.154 Mean :2.712 Mean :0.7939
## 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:1.0000
## Max. :4.000 Max. :4.000 Max. :3.0000
##
## TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## Min. : 0.00 Min. :0.000 Min. :1.000 Min. : 0.000
## 1st Qu.: 6.00 1st Qu.:2.000 1st Qu.:2.000 1st Qu.: 3.000
## Median :10.00 Median :3.000 Median :3.000 Median : 5.000
## Mean :11.28 Mean :2.799 Mean :2.761 Mean : 7.008
## 3rd Qu.:15.00 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.: 9.000
## Max. :40.00 Max. :6.000 Max. :4.000 Max. :40.000
##
## YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 2.000 1st Qu.: 0.000 1st Qu.: 2.000
## Median : 3.000 Median : 1.000 Median : 3.000
## Mean : 4.229 Mean : 2.188 Mean : 4.123
## 3rd Qu.: 7.000 3rd Qu.: 3.000 3rd Qu.: 7.000
## Max. :18.000 Max. :15.000 Max. :17.000
##
attrition$Education<-as.factor(attrition$Education)
attrition$EnvironmentSatisfaction<-as.factor(attrition$EnvironmentSatisfaction)
attrition$JobInvolvement<-as.factor(attrition$JobInvolvement)
attrition$JobLevel<-as.factor(attrition$JobLevel)
attrition$JobSatisfaction<-as.factor(attrition$JobSatisfaction)
attrition$PerformanceRating<-as.factor(attrition$PerformanceRating)
attrition$RelationshipSatisfaction<-as.factor(attrition$RelationshipSatisfaction)
attrition$StockOptionLevel<-as.factor(attrition$StockOptionLevel)
attrition$WorkLifeBalance<-as.factor(attrition$WorkLifeBalance)
summary(attrition)
## Attrition BusinessTravel Department
## No :1233 Non-Travel : 150 Human Resources : 63
## Yes: 237 Travel_Frequently: 277 Research & Development:961
## Travel_Rarely :1043 Sales :446
##
##
##
##
## EducationField Gender JobRole
## Human Resources : 27 Female:588 Sales Executive :326
## Life Sciences :606 Male :882 Research Scientist :292
## Marketing :159 Laboratory Technician :259
## Medical :464 Manufacturing Director :145
## Other : 82 Healthcare Representative:131
## Technical Degree:132 Manager :102
## (Other) :215
## OverTime MaritalStatus Age DailyRate
## No :1054 Divorced:327 Min. :18.00 Min. : 102.0
## Yes: 416 Married :673 1st Qu.:30.00 1st Qu.: 465.0
## Single :470 Median :36.00 Median : 802.0
## Mean :36.92 Mean : 802.5
## 3rd Qu.:43.00 3rd Qu.:1157.0
## Max. :60.00 Max. :1499.0
##
## DistanceFromHome Education EmployeeNumber EnvironmentSatisfaction
## Min. : 1.000 1:170 Min. : 1.0 1:284
## 1st Qu.: 2.000 2:282 1st Qu.: 491.2 2:287
## Median : 7.000 3:572 Median :1020.5 3:453
## Mean : 9.193 4:398 Mean :1024.9 4:446
## 3rd Qu.:14.000 5: 48 3rd Qu.:1555.8
## Max. :29.000 Max. :2068.0
##
## HourlyRate JobInvolvement JobLevel JobSatisfaction MonthlyIncome
## Min. : 30.00 1: 83 1:543 1:289 Min. : 1009
## 1st Qu.: 48.00 2:375 2:534 2:280 1st Qu.: 2911
## Median : 66.00 3:868 3:218 3:442 Median : 4919
## Mean : 65.89 4:144 4:106 4:459 Mean : 6503
## 3rd Qu.: 83.75 5: 69 3rd Qu.: 8379
## Max. :100.00 Max. :19999
##
## MonthlyRate NumCompaniesWorked PercentSalaryHike PerformanceRating
## Min. : 2094 Min. :0.000 Min. :11.00 3:1244
## 1st Qu.: 8047 1st Qu.:1.000 1st Qu.:12.00 4: 226
## Median :14236 Median :2.000 Median :14.00
## Mean :14313 Mean :2.693 Mean :15.21
## 3rd Qu.:20462 3rd Qu.:4.000 3rd Qu.:18.00
## Max. :26999 Max. :9.000 Max. :25.00
##
## RelationshipSatisfaction StockOptionLevel TotalWorkingYears
## 1:276 0:631 Min. : 0.00
## 2:303 1:596 1st Qu.: 6.00
## 3:459 2:158 Median :10.00
## 4:432 3: 85 Mean :11.28
## 3rd Qu.:15.00
## Max. :40.00
##
## TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole
## Min. :0.000 1: 80 Min. : 0.000 Min. : 0.000
## 1st Qu.:2.000 2:344 1st Qu.: 3.000 1st Qu.: 2.000
## Median :3.000 3:893 Median : 5.000 Median : 3.000
## Mean :2.799 4:153 Mean : 7.008 Mean : 4.229
## 3rd Qu.:3.000 3rd Qu.: 9.000 3rd Qu.: 7.000
## Max. :6.000 Max. :40.000 Max. :18.000
##
## YearsSinceLastPromotion YearsWithCurrManager
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 2.000
## Median : 1.000 Median : 3.000
## Mean : 2.188 Mean : 4.123
## 3rd Qu.: 3.000 3rd Qu.: 7.000
## Max. :15.000 Max. :17.000
##
# attach for ease of running code
attach(attrition)
set.seed(123)
split <- sample.split(attrition$Attrition, SplitRatio = 0.80)
#get training and test data
data.train <- subset(attrition, split == TRUE)
data.test <- subset(attrition, split == FALSE)
options(width = 300) ## print
null.model.train<-glm(formula=Attrition~1, family = binomial(link="logit"), data=data.train)
full.model.train<-glm(formula=Attrition~., family = binomial(link="logit"), data=data.train)
summary(full.model.train)
##
## Call:
## glm(formula = Attrition ~ ., family = binomial(link = "logit"),
## data = data.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8888 -0.4561 -0.1967 -0.0584 3.6739
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.397e+00 6.424e+02 -0.015 0.988330
## BusinessTravelTravel_Frequently 1.973e+00 4.971e-01 3.970 7.19e-05 ***
## BusinessTravelTravel_Rarely 9.463e-01 4.612e-01 2.052 0.040162 *
## DepartmentResearch & Development 1.347e+01 6.424e+02 0.021 0.983277
## DepartmentSales 1.347e+01 6.424e+02 0.021 0.983274
## EducationFieldLife Sciences -9.159e-01 9.705e-01 -0.944 0.345297
## EducationFieldMarketing -5.541e-01 1.026e+00 -0.540 0.589208
## EducationFieldMedical -1.061e+00 9.657e-01 -1.099 0.271850
## EducationFieldOther -1.062e+00 1.052e+00 -1.010 0.312496
## EducationFieldTechnical Degree -1.206e-01 9.912e-01 -0.122 0.903150
## GenderMale 4.060e-01 2.172e-01 1.869 0.061656 .
## JobRoleHuman Resources 1.378e+01 6.424e+02 0.021 0.982890
## JobRoleLaboratory Technician 6.647e-01 6.741e-01 0.986 0.324097
## JobRoleManager -1.572e-01 1.220e+00 -0.129 0.897457
## JobRoleManufacturing Director 5.664e-01 6.402e-01 0.885 0.376268
## JobRoleResearch Director -2.135e+00 1.388e+00 -1.538 0.124131
## JobRoleResearch Scientist -6.835e-01 7.058e-01 -0.968 0.332903
## JobRoleSales Executive 1.434e+00 1.450e+00 0.989 0.322653
## JobRoleSales Representative 1.165e+00 1.528e+00 0.763 0.445612
## OverTimeYes 2.066e+00 2.357e-01 8.769 < 2e-16 ***
## MaritalStatusMarried 3.710e-01 3.209e-01 1.156 0.247708
## MaritalStatusSingle 6.425e-01 4.564e-01 1.408 0.159182
## Age -2.812e-02 1.589e-02 -1.770 0.076713 .
## DailyRate -3.854e-04 2.606e-04 -1.479 0.139271
## DistanceFromHome 5.149e-02 1.286e-02 4.003 6.26e-05 ***
## Education2 3.560e-01 3.918e-01 0.909 0.363550
## Education3 2.119e-01 3.502e-01 0.605 0.545108
## Education4 3.073e-01 3.747e-01 0.820 0.412153
## Education5 2.933e-01 7.307e-01 0.401 0.688090
## EmployeeNumber -1.677e-04 1.839e-04 -0.912 0.361818
## EnvironmentSatisfaction2 -1.334e+00 3.360e-01 -3.969 7.22e-05 ***
## EnvironmentSatisfaction3 -1.250e+00 2.980e-01 -4.195 2.73e-05 ***
## EnvironmentSatisfaction4 -1.410e+00 2.990e-01 -4.716 2.40e-06 ***
## HourlyRate 4.870e-03 5.254e-03 0.927 0.354039
## JobInvolvement2 -1.309e+00 4.309e-01 -3.037 0.002388 **
## JobInvolvement3 -1.681e+00 4.089e-01 -4.112 3.93e-05 ***
## JobInvolvement4 -2.041e+00 5.415e-01 -3.769 0.000164 ***
## JobLevel2 -1.677e+00 5.263e-01 -3.186 0.001444 **
## JobLevel3 3.282e-01 8.144e-01 0.403 0.686981
## JobLevel4 -6.879e-01 1.400e+00 -0.491 0.623217
## JobLevel5 1.452e+00 1.975e+00 0.735 0.462083
## JobSatisfaction2 -4.466e-01 3.169e-01 -1.409 0.158823
## JobSatisfaction3 -5.249e-01 2.866e-01 -1.831 0.067051 .
## JobSatisfaction4 -1.214e+00 3.064e-01 -3.961 7.47e-05 ***
## MonthlyIncome -1.526e-04 1.070e-04 -1.425 0.154053
## MonthlyRate 1.950e-05 1.482e-05 1.316 0.188204
## NumCompaniesWorked 2.007e-01 4.632e-02 4.332 1.48e-05 ***
## PercentSalaryHike -3.810e-02 4.673e-02 -0.815 0.414835
## PerformanceRating4 5.542e-02 4.846e-01 0.114 0.908947
## RelationshipSatisfaction2 -8.635e-01 3.411e-01 -2.532 0.011356 *
## RelationshipSatisfaction3 -8.888e-01 3.029e-01 -2.934 0.003344 **
## RelationshipSatisfaction4 -9.020e-01 3.004e-01 -3.003 0.002671 **
## StockOptionLevel1 -1.050e+00 3.627e-01 -2.894 0.003803 **
## StockOptionLevel2 -1.066e+00 4.908e-01 -2.172 0.029866 *
## StockOptionLevel3 -5.810e-01 5.815e-01 -0.999 0.317755
## TotalWorkingYears -6.285e-02 3.547e-02 -1.772 0.076398 .
## TrainingTimesLastYear -1.377e-01 8.333e-02 -1.652 0.098548 .
## WorkLifeBalance2 -1.040e+00 4.259e-01 -2.441 0.014628 *
## WorkLifeBalance3 -1.571e+00 3.999e-01 -3.930 8.51e-05 ***
## WorkLifeBalance4 -1.262e+00 4.845e-01 -2.604 0.009216 **
## YearsAtCompany 1.210e-01 4.956e-02 2.442 0.014605 *
## YearsInCurrentRole -1.620e-01 5.740e-02 -2.823 0.004758 **
## YearsSinceLastPromotion 1.455e-01 5.012e-02 2.904 0.003690 **
## YearsWithCurrManager -1.246e-01 5.428e-02 -2.295 0.021737 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1040.18 on 1175 degrees of freedom
## Residual deviance: 633.03 on 1112 degrees of freedom
## AIC: 761.03
##
## Number of Fisher Scoring iterations: 15
#sort summary by p-value descending
tempDF<-as.data.frame(summary(full.model.train)$coefficients)
tempDF[order(tempDF$`Pr(>|z|)`),]
## Estimate Std. Error z value Pr(>|z|)
## OverTimeYes 2.066496e+00 2.356697e-01 8.76861346 1.808798e-18
## EnvironmentSatisfaction4 -1.410304e+00 2.990337e-01 -4.71620534 2.402839e-06
## NumCompaniesWorked 2.006728e-01 4.632009e-02 4.33230549 1.475560e-05
## EnvironmentSatisfaction3 -1.250120e+00 2.980255e-01 -4.19467519 2.732629e-05
## JobInvolvement3 -1.681390e+00 4.089481e-01 -4.11150059 3.930959e-05
## DistanceFromHome 5.148581e-02 1.286262e-02 4.00274540 6.261167e-05
## BusinessTravelTravel_Frequently 1.973438e+00 4.970795e-01 3.97006641 7.185260e-05
## EnvironmentSatisfaction2 -1.333660e+00 3.360157e-01 -3.96904008 7.216274e-05
## JobSatisfaction4 -1.213550e+00 3.063924e-01 -3.96077056 7.470829e-05
## WorkLifeBalance3 -1.571416e+00 3.998989e-01 -3.92953309 8.511096e-05
## JobInvolvement4 -2.040537e+00 5.414673e-01 -3.76853114 1.642110e-04
## JobLevel2 -1.676527e+00 5.262505e-01 -3.18579714 1.443558e-03
## JobInvolvement2 -1.308709e+00 4.308921e-01 -3.03720794 2.387807e-03
## RelationshipSatisfaction4 -9.020471e-01 3.003511e-01 -3.00330930 2.670609e-03
## RelationshipSatisfaction3 -8.887712e-01 3.029005e-01 -2.93420155 3.344070e-03
## YearsSinceLastPromotion 1.455322e-01 5.012251e-02 2.90352946 3.689823e-03
## StockOptionLevel1 -1.049812e+00 3.627458e-01 -2.89407158 3.802815e-03
## YearsInCurrentRole -1.620440e-01 5.740223e-02 -2.82295655 4.758302e-03
## WorkLifeBalance4 -1.261649e+00 4.845143e-01 -2.60394504 9.215753e-03
## RelationshipSatisfaction2 -8.634599e-01 3.410797e-01 -2.53154875 1.135600e-02
## YearsAtCompany 1.210194e-01 4.955692e-02 2.44202874 1.460498e-02
## WorkLifeBalance2 -1.039934e+00 4.259488e-01 -2.44145443 1.462823e-02
## YearsWithCurrManager -1.245621e-01 5.427698e-02 -2.29493430 2.173689e-02
## StockOptionLevel2 -1.065877e+00 4.907660e-01 -2.17186337 2.986597e-02
## BusinessTravelTravel_Rarely 9.463215e-01 4.611534e-01 2.05207533 4.016234e-02
## GenderMale 4.059671e-01 2.172388e-01 1.86875916 6.165633e-02
## JobSatisfaction3 -5.249475e-01 2.866475e-01 -1.83133497 6.705056e-02
## TotalWorkingYears -6.285103e-02 3.546935e-02 -1.77198141 7.639764e-02
## Age -2.811934e-02 1.588587e-02 -1.77008462 7.671304e-02
## TrainingTimesLastYear -1.376637e-01 8.333496e-02 -1.65193159 9.854850e-02
## JobRoleResearch Director -2.134856e+00 1.388377e+00 -1.53766315 1.241310e-01
## DailyRate -3.853517e-04 2.606352e-04 -1.47850948 1.392715e-01
## MonthlyIncome -1.525736e-04 1.070422e-04 -1.42535949 1.540533e-01
## JobSatisfaction2 -4.465805e-01 3.169395e-01 -1.40904014 1.588233e-01
## MaritalStatusSingle 6.425277e-01 4.563969e-01 1.40782657 1.591824e-01
## MonthlyRate 1.950394e-05 1.482162e-05 1.31591095 1.882039e-01
## MaritalStatusMarried 3.709706e-01 3.209272e-01 1.15593375 2.477083e-01
## EducationFieldMedical -1.061073e+00 9.656534e-01 -1.09881307 2.718496e-01
## EducationFieldOther -1.062406e+00 1.051888e+00 -1.00999854 3.124960e-01
## StockOptionLevel3 -5.809903e-01 5.815237e-01 -0.99908270 3.177546e-01
## JobRoleSales Executive 1.434065e+00 1.449984e+00 0.98902152 3.226526e-01
## JobRoleLaboratory Technician 6.647463e-01 6.741342e-01 0.98607410 3.240968e-01
## JobRoleResearch Scientist -6.834538e-01 7.058412e-01 -0.96828274 3.329032e-01
## EducationFieldLife Sciences -9.159258e-01 9.705168e-01 -0.94375056 3.452971e-01
## HourlyRate 4.869592e-03 5.254288e-03 0.92678425 3.540386e-01
## EmployeeNumber -1.677276e-04 1.839306e-04 -0.91190664 3.618179e-01
## Education2 3.559715e-01 3.917708e-01 0.90862200 3.635497e-01
## JobRoleManufacturing Director 5.664383e-01 6.401926e-01 0.88479366 3.762679e-01
## Education4 3.073242e-01 3.747351e-01 0.82011057 4.121531e-01
## PercentSalaryHike -3.810140e-02 4.672639e-02 -0.81541506 4.148348e-01
## JobRoleSales Representative 1.165268e+00 1.527716e+00 0.76275124 4.456118e-01
## JobLevel5 1.452142e+00 1.974573e+00 0.73542100 4.620831e-01
## Education3 2.119179e-01 3.502153e-01 0.60510745 5.451076e-01
## EducationFieldMarketing -5.540912e-01 1.026125e+00 -0.53998403 5.892080e-01
## JobLevel4 -6.878840e-01 1.400140e+00 -0.49129655 6.232167e-01
## JobLevel3 3.281793e-01 8.144302e-01 0.40295569 6.869808e-01
## Education5 2.933429e-01 7.307101e-01 0.40144909 6.880895e-01
## JobRoleManager -1.572259e-01 1.219999e+00 -0.12887382 8.974575e-01
## EducationFieldTechnical Degree -1.206066e-01 9.911554e-01 -0.12168282 9.031502e-01
## PerformanceRating4 5.542427e-02 4.846185e-01 0.11436679 9.089470e-01
## JobRoleHuman Resources 1.377763e+01 6.424413e+02 0.02144574 9.828901e-01
## DepartmentSales 1.346853e+01 6.424410e+02 0.02096462 9.832739e-01
## DepartmentResearch & Development 1.346568e+01 6.424409e+02 0.02096018 9.832774e-01
## (Intercept) -9.396688e+00 6.424429e+02 -0.01462650 9.883302e-01
anova(full.model.train, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Attrition
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 1175 1040.18
## BusinessTravel 2 18.868 1173 1021.32 7.995e-05 ***
## Department 2 8.692 1171 1012.62 0.0129553 *
## EducationField 5 6.340 1166 1006.28 0.2744831
## Gender 1 0.637 1165 1005.65 0.4246324
## JobRole 8 69.556 1157 936.09 6.024e-12 ***
## OverTime 1 61.827 1156 874.26 3.751e-15 ***
## MaritalStatus 2 32.054 1154 842.21 1.095e-07 ***
## Age 1 5.387 1153 836.82 0.0202852 *
## DailyRate 1 3.178 1152 833.64 0.0746336 .
## DistanceFromHome 1 10.304 1151 823.34 0.0013276 **
## Education 4 1.753 1147 821.59 0.7810091
## EmployeeNumber 1 0.109 1146 821.48 0.7416105
## EnvironmentSatisfaction 3 25.105 1143 796.37 1.468e-05 ***
## HourlyRate 1 0.090 1142 796.28 0.7642039
## JobInvolvement 3 19.187 1139 777.10 0.0002501 ***
## JobLevel 4 40.262 1135 736.83 3.821e-08 ***
## JobSatisfaction 3 18.479 1132 718.35 0.0003503 ***
## MonthlyIncome 1 4.053 1131 714.30 0.0440873 *
## MonthlyRate 1 3.116 1130 711.18 0.0775034 .
## NumCompaniesWorked 1 15.350 1129 695.83 8.932e-05 ***
## PercentSalaryHike 1 1.739 1128 694.10 0.1873082
## PerformanceRating 1 0.004 1127 694.09 0.9465417
## RelationshipSatisfaction 3 10.130 1124 683.96 0.0174921 *
## StockOptionLevel 3 8.053 1121 675.91 0.0449197 *
## TotalWorkingYears 1 1.976 1120 673.93 0.1598558
## TrainingTimesLastYear 1 2.900 1119 671.03 0.0885629 .
## WorkLifeBalance 3 15.866 1116 655.17 0.0012078 **
## YearsAtCompany 1 0.917 1115 654.25 0.3382468
## YearsInCurrentRole 1 8.610 1114 645.64 0.0033438 **
## YearsSinceLastPromotion 1 7.425 1113 638.21 0.0064314 **
## YearsWithCurrManager 1 5.180 1112 633.03 0.0228466 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## step(null.model.train, scope=list(upper=full.model.train), direction="both", test="Chisq", data=data.train)
##
## . . .
## < many steps later >
## . . .
## Df Deviance AIC LRT Pr(>Chi)
## <none> 653.05 745.05
## ...
## - DistanceFromHome 1 668.28 758.28 15.224 9.547e-05 ***
## - JobSatisfaction 3 672.87 758.87 19.820 0.0001850 ***
## - JobInvolvement 3 673.25 759.25 20.195 0.0001546 ***
## - NumCompaniesWorked 1 672.21 762.21 19.153 1.207e-05 ***
## - BusinessTravel 2 675.76 763.76 22.705 1.174e-05 ***
## - EnvironmentSatisfaction 3 683.97 769.97 30.921 8.831e-07 ***
## - JobLevel 4 686.83 770.83 33.779 8.271e-07 ***
## - JobRole 8 704.21 780.21 51.153 2.452e-08 ***
## - OverTime 1 743.18 833.18 90.130 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## . . .
## Degrees of Freedom: 1175 Total (i.e. Null); 1130 Residual
## Null Deviance: 1040
## Residual Deviance: 649.2 AIC: 741.2
final.model.train<-glm(formula=Attrition~OverTime+JobRole+JobLevel+EnvironmentSatisfaction+BusinessTravel+NumCompaniesWorked+JobInvolvement+JobSatisfaction+DistanceFromHome, family = binomial(link="logit"), data=data.train)
summary(final.model.train)
##
## Call:
## glm(formula = Attrition ~ OverTime + JobRole + JobLevel + EnvironmentSatisfaction +
## BusinessTravel + NumCompaniesWorked + JobInvolvement + JobSatisfaction +
## DistanceFromHome, family = binomial(link = "logit"), data = data.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7672 -0.5318 -0.2972 -0.1384 2.9413
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.376960 0.809049 -0.466 0.641266
## OverTimeYes 1.703728 0.199136 8.556 < 2e-16 ***
## JobRoleHuman Resources 0.282504 0.679131 0.416 0.677426
## JobRoleLaboratory Technician 0.439641 0.592724 0.742 0.458250
## JobRoleManager -0.971051 0.989569 -0.981 0.326452
## JobRoleManufacturing Director 0.004675 0.569080 0.008 0.993446
## JobRoleResearch Director -2.384876 1.190992 -2.002 0.045239 *
## JobRoleResearch Scientist -0.545522 0.612119 -0.891 0.372821
## JobRoleSales Executive 1.296621 0.457348 2.835 0.004581 **
## JobRoleSales Representative 1.080430 0.647221 1.669 0.095051 .
## JobLevel2 -2.075147 0.420737 -4.932 8.13e-07 ***
## JobLevel3 -1.142405 0.497756 -2.295 0.021727 *
## JobLevel4 -1.998374 0.727256 -2.748 0.005999 **
## JobLevel5 -0.617082 1.160975 -0.532 0.595058
## EnvironmentSatisfaction2 -1.236091 0.298756 -4.137 3.51e-05 ***
## EnvironmentSatisfaction3 -1.098946 0.253043 -4.343 1.41e-05 ***
## EnvironmentSatisfaction4 -1.200238 0.258863 -4.637 3.54e-06 ***
## BusinessTravelTravel_Frequently 1.379870 0.428882 3.217 0.001294 **
## BusinessTravelTravel_Rarely 0.593573 0.401833 1.477 0.139632
## NumCompaniesWorked 0.107992 0.037124 2.909 0.003627 **
## JobInvolvement2 -1.063282 0.375174 -2.834 0.004595 **
## JobInvolvement3 -1.540777 0.354925 -4.341 1.42e-05 ***
## JobInvolvement4 -2.007050 0.475901 -4.217 2.47e-05 ***
## JobSatisfaction2 -0.263847 0.285172 -0.925 0.354851
## JobSatisfaction3 -0.407503 0.254084 -1.604 0.108755
## JobSatisfaction4 -1.076902 0.272780 -3.948 7.88e-05 ***
## DistanceFromHome 0.037731 0.011219 3.363 0.000771 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1040.18 on 1175 degrees of freedom
## Residual deviance: 757.71 on 1149 degrees of freedom
## AIC: 811.71
##
## Number of Fisher Scoring iterations: 7
#sort summary by p-value descending
tempDF<-as.data.frame(summary(final.model.train)$coefficients)
tempDF[order(tempDF$`Pr(>|z|)`),]
## Estimate Std. Error z value Pr(>|z|)
## OverTimeYes 1.703728051 0.19913586 8.555606350 1.172508e-17
## JobLevel2 -2.075147485 0.42073695 -4.932173176 8.131977e-07
## EnvironmentSatisfaction4 -1.200237626 0.25886288 -4.636576725 3.542267e-06
## EnvironmentSatisfaction3 -1.098945522 0.25304310 -4.342918298 1.406024e-05
## JobInvolvement3 -1.540776733 0.35492533 -4.341129219 1.417523e-05
## JobInvolvement4 -2.007049927 0.47590144 -4.217364685 2.471741e-05
## EnvironmentSatisfaction2 -1.236091203 0.29875636 -4.137455719 3.511782e-05
## JobSatisfaction4 -1.076902408 0.27277952 -3.947885866 7.884437e-05
## DistanceFromHome 0.037731054 0.01121887 3.363179102 7.705036e-04
## BusinessTravelTravel_Frequently 1.379869980 0.42888209 3.217364400 1.293742e-03
## NumCompaniesWorked 0.107991628 0.03712439 2.908912829 3.626880e-03
## JobRoleSales Executive 1.296621357 0.45734763 2.835089280 4.581288e-03
## JobInvolvement2 -1.063282476 0.37517447 -2.834101357 4.595475e-03
## JobLevel4 -1.998374192 0.72725593 -2.747827989 5.999147e-03
## JobLevel3 -1.142405118 0.49775586 -2.295111348 2.172674e-02
## JobRoleResearch Director -2.384876479 1.19099203 -2.002428577 4.523866e-02
## JobRoleSales Representative 1.080429892 0.64722135 1.669335991 9.505081e-02
## JobSatisfaction3 -0.407503426 0.25408361 -1.603816278 1.087546e-01
## BusinessTravelTravel_Rarely 0.593573311 0.40183349 1.477162370 1.396321e-01
## JobRoleManager -0.971051006 0.98956937 -0.981286438 3.264515e-01
## JobSatisfaction2 -0.263847412 0.28517207 -0.925221783 3.548506e-01
## JobRoleResearch Scientist -0.545521561 0.61211892 -0.891201924 3.728209e-01
## JobRoleLaboratory Technician 0.439641378 0.59272358 0.741730868 4.582504e-01
## JobLevel5 -0.617081935 1.16097506 -0.531520405 5.950582e-01
## (Intercept) -0.376959731 0.80904868 -0.465929603 6.412659e-01
## JobRoleHuman Resources 0.282503674 0.67913145 0.415977897 6.774262e-01
## JobRoleManufacturing Director 0.004674672 0.56908005 0.008214437 9.934459e-01
predict.results <- predict(final.model.train,newdata=subset(data.test,select=c(2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32)),type='response')
predict.results <- ifelse(predict.results > 0.5,1,0)
misClasificError <- mean(predict.results)
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.918367346938776"
final.model.alldata<-glm(formula=Attrition~OverTime+JobRole+JobLevel+EnvironmentSatisfaction+BusinessTravel+NumCompaniesWorked+JobInvolvement+JobSatisfaction+DistanceFromHome, family = binomial(link="logit"), data=attrition)
summary(final.model.alldata)
##
## Call:
## glm(formula = Attrition ~ OverTime + JobRole + JobLevel + EnvironmentSatisfaction +
## BusinessTravel + NumCompaniesWorked + JobInvolvement + JobSatisfaction +
## DistanceFromHome, family = binomial(link = "logit"), data = attrition)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7212 -0.5375 -0.3002 -0.1340 3.0712
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.83400 0.71661 -1.164 0.244499
## OverTimeYes 1.76957 0.17760 9.964 < 2e-16 ***
## JobRoleHuman Resources 0.47126 0.61437 0.767 0.443041
## JobRoleLaboratory Technician 0.31962 0.53750 0.595 0.552085
## JobRoleManager -0.99151 0.92744 -1.069 0.285033
## JobRoleManufacturing Director 0.05172 0.51121 0.101 0.919414
## JobRoleResearch Director -2.22301 1.03063 -2.157 0.031010 *
## JobRoleResearch Scientist -0.44267 0.55256 -0.801 0.423059
## JobRoleSales Executive 1.34413 0.41115 3.269 0.001078 **
## JobRoleSales Representative 1.09145 0.58897 1.853 0.063863 .
## JobLevel2 -2.08559 0.38254 -5.452 4.98e-08 ***
## JobLevel3 -1.38643 0.45075 -3.076 0.002099 **
## JobLevel4 -2.16495 0.68233 -3.173 0.001509 **
## JobLevel5 -0.25018 1.02437 -0.244 0.807054
## EnvironmentSatisfaction2 -0.99411 0.25952 -3.831 0.000128 ***
## EnvironmentSatisfaction3 -1.09562 0.23010 -4.762 1.92e-06 ***
## EnvironmentSatisfaction4 -1.20054 0.23452 -5.119 3.07e-07 ***
## BusinessTravelTravel_Frequently 1.66670 0.38353 4.346 1.39e-05 ***
## BusinessTravelTravel_Rarely 0.85413 0.35785 2.387 0.016993 *
## NumCompaniesWorked 0.12553 0.03292 3.813 0.000137 ***
## JobInvolvement2 -0.97597 0.32700 -2.985 0.002839 **
## JobInvolvement3 -1.40635 0.30906 -4.550 5.35e-06 ***
## JobInvolvement4 -2.10620 0.43373 -4.856 1.20e-06 ***
## JobSatisfaction2 -0.42584 0.25427 -1.675 0.093976 .
## JobSatisfaction3 -0.48860 0.22383 -2.183 0.029042 *
## JobSatisfaction4 -1.08938 0.23978 -4.543 5.54e-06 ***
## DistanceFromHome 0.04245 0.01011 4.200 2.67e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1298.58 on 1469 degrees of freedom
## Residual deviance: 946.73 on 1443 degrees of freedom
## AIC: 1000.7
##
## Number of Fisher Scoring iterations: 6
| Covarience/Value | Odds Ratio |
|---|---|
| JobRoleSales Representative | 8.3669402 |
| OverTimeYes | 5.3426139 |
| BusinessTravelTravel_Frequently | 4.7865786 |
| JobRoleLaboratory Technician | 3.6904708 |
| JobRoleHuman Resources | 3.4551899 |
| JobRoleSales Executive | 3.1500722 |
| BusinessTravelTravel_Rarely | 2.2161456 |
| JobRoleResearch Scientist | 1.7691884 |
| JobRoleManager | 1.5547907 |
| NumCompaniesWorked | 1.119052 |
| DistanceFromHome | 1.0362319 |
Note: All analysis has been performed on employees only (No attrition participants)
1) Manager Job Role:
a) Only job role that exists in all departments. All other job roles are unique to a department.
b) Have the longest average duration at the company
2) Managers and Directors
a) Tend to be older than people in other job roles
b) Have higher Job Levels
c) Have higher Monthly Income
3) Manufacturing Director
a) The only Job Role where Males do not outnumber Females
4) Human Resource Manager
a) is the only Role which does not have ‘Low’ level of job involvement
b) No PhDs (characteristic in common with Sale Reps)
c) All HR Managers have some College level education (a characteristic unique to this Job Role)
5) Sales Reps and Reseach Scientists
a) have the lowest Job Levels
6) Sales Reps
a) is the only Job Role which has no employees with NumWorked = 9!!
b) have the lowest average duration at the company
c) change managers most frequently, while directors and managers tend to stay with the same manager for a longer time
d) No PhDs (characteristic in common with HR Managers)
e) the highest percentage of employees with college-level education (25%)
ggplot(data=as.data.frame(table(noattrit$Department,noattrit$JobRole,dnn=list("Department","JobRole"))), aes(x= reorder(JobRole,Freq),y=Freq, fill = Department)) +
geom_bar(stat="identity") + coord_flip() + ggtitle("Number of Employees per Job Role by Department") + xlab("") + ylab("")+theme(plot.title=element_text(hjust=0.5))+scale_fill_manual(values=alpha(c("steelblue4", "thistle4", "paleturquoise4")))
Linear Regression Modeling (See Experiment Tab) suggests Education Field, Age, Job Involvement, Gender, Job Level, Monthly Income, Number of Companies Worked, Years At Company and Years with Current Manager are different between of Job Roles
ggplot(data=as.data.frame(table(noattrit$JobRole,noattrit$EducationField,dnn=list("JobRole","EducationField"))), aes(x= reorder(JobRole,Freq),y=Freq, fill = EducationField)) +
geom_bar(stat="identity",position="fill") + coord_flip() + ggtitle("Percent of Employees per Job Role by Education Field") + xlab("") + ylab("Percent of Employees")+theme(plot.title=element_text(hjust=0.5))+scale_fill_manual(values=alpha(c("steelblue4", "thistle4", "paleturquoise4", "midnightblue", "mistyrose4", "lightcyan3")))
Though not identified as a first order differentiator by linear regression, Education is an interesting parameter in regards to Job Role
ggplot(data=as.data.frame(table(noattrit$JobRole,noattrit$Education,dnn=list("JobRole","Education"))), aes(x= reorder(JobRole,Freq),y=Freq, fill = Education)) +
geom_bar(stat="identity",position="fill") + coord_flip() + ggtitle("Education by Job Role \nNo PhD Sales Reps nor HR Managers \nAll HR Managers Have Some College") + xlab("") + ylab("% Employees")+theme(plot.title=element_text(hjust=0.5))+scale_fill_manual(values=alpha(c("steelblue4", "thistle4", "paleturquoise4", "mistyrose4", "lightcyan3")))
All Departments have a mix of educational fields
People with Education in Marketing only work in the Sales Department
People with Education in Human Resources work in the Human Resources Department
R&D is predominantly composed of people with Life Sciences and Medical Education
No PhD Sales Reps nor HR Managers
**All HR Managers Have Some College (unique from all other job roles)“**
Sales Reps have the highest percentage of employees with college-level education (25%)
noattrit$JobRole <-with(noattrit, ifelse(noattrit$JobRole=="Manager", paste(noattrit$Department,noattrit$JobRole), as.character(noattrit$JobRole)))
ggplot(data=noattrit,aes(x= JobRole,y=Age)) +
geom_boxplot() + coord_flip() + ggtitle("Age by Job Role") + xlab("")+ylab("Age")+theme(plot.title=element_text(hjust=0.5))
#Descripitve Statistics of Employee Ages
summary(noattrit$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 31.00 36.00 37.56 43.00 60.00
results <- lm(noattrit$JRCode~noattrit$Age)
summary(results)
##
## Call:
## lm(formula = noattrit$JRCode ~ noattrit$Age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0652 -2.0573 0.9379 1.9400 4.9427
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.0494530 0.2910012 17.352 <2e-16 ***
## noattrit$Age 0.0002632 0.0075984 0.035 0.972
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.31 on 1196 degrees of freedom
## (35 observations deleted due to missingness)
## Multiple R-squared: 1.003e-06, Adjusted R-squared: -0.0008351
## F-statistic: 0.0012 on 1 and 1196 DF, p-value: 0.9724
1. Managers and Directors tend be older than people in other roles
2. The oldest employee is a Sales Executive (60 years old)
3. The youngest employee is a Laboratory Technician (18 years old)
ggplot(data=as.data.frame(table(noattrit$JobRole,noattrit$Gender,dnn=list("JobRole","Gender"))), aes(x= reorder(JobRole,Freq),y=Freq, fill = Gender)) +
geom_bar(stat="identity",position="fill") + coord_flip() + ggtitle("Males OutNumber Females in Most Job Roles \n (Delta is not statiscally significant)") + xlab("") + ylab("Percent of Employees")+theme(plot.title=element_text(hjust=0.5))+scale_fill_manual(values=alpha(c("steelblue4", "thistle4"))) + geom_hline(data=noattrit, aes(yintercept = .50), colour="black")
ggplot(data=noattrit,aes(x= JobRole,y=MonthlyIncome,group=JobRole)) + geom_boxplot() + coord_flip() + ggtitle("Managers and Directors Have Higher Compensation \n (p-value < .00001)") + xlab("")+ylab("Monthly Income")+theme(plot.title=element_text(hjust=0.5))
ggplot(data=as.data.frame(table(noattrit$JobRole,noattrit$JobLevel,dnn=list("JobRole","JobLevel"))), aes(x= reorder(JobRole,Freq),y=Freq, fill = JobLevel)) + geom_bar(stat="identity",position="fill") + coord_flip() + ggtitle("Managers & Directors Tend to Have Higher Job Levels") + xlab("Percent of Employees") + ylab("")+theme(plot.title=element_text(hjust=0.5))+scale_fill_manual(values=alpha(c("steelblue4", "thistle4", "paleturquoise4", "mistyrose4", "lightcyan3")))
results <- lm(noattrit$JRCode~noattrit$JobLevel)
summary(results)
##
## Call:
## lm(formula = noattrit$JRCode ~ noattrit$JobLevel)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.526 -1.795 0.718 2.205 4.718
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.55089 0.14517 31.349 < 2e-16 ***
## noattrit$JobLevel 0.24371 0.06191 3.937 8.74e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.295 on 1196 degrees of freedom
## (35 observations deleted due to missingness)
## Multiple R-squared: 0.01279, Adjusted R-squared: 0.01197
## F-statistic: 15.5 on 1 and 1196 DF, p-value: 8.742e-05
1. Managers and Directors have higher Job Levels.
2. Sales Reps and Research Scientists have the lowest Job Levels.
ggplot(data=as.data.frame(table(attrit$JobRole,attrit$JobInvolvement,dnn=list("JobRole","JobInvolvement"))), aes(x= reorder(JobRole,Freq),y=Freq, fill = JobInvolvement)) +
geom_bar(stat="identity",position="fill") + coord_flip() + ggtitle(" HR Managers are the only Role \nwithout Low Job Involvement") + xlab("Percent of Employees") + ylab("") +theme(plot.title=element_text(hjust=0.5))+scale_fill_manual(values=alpha(c("steelblue4", "thistle4", "paleturquoise4", "mistyrose4")))
ggplot(data=noattrit,aes(x= JobRole,y=noattrit$NumCompaniesWorked)) +
geom_boxplot() + coord_flip() + ggtitle("Many Employees have Experience at Several Companies") + xlab("Number of Companies")+ylab("") +theme(plot.title=element_text(hjust=0.5))
Sales Rep is the only Job Role which has no employees with NumWorked = 9!!
ggplot(data=noattrit,aes(x= JobRole,y=noattrit$YearsAtCompany)) + geom_boxplot() + coord_flip() + ggtitle("Significant Differences between Tenure by Job Role \n(p-value=.0008)") + xlab("Number of Years at Company")+ylab("") +theme(plot.title=element_text(hjust=0.5))
results <- lm(noattrit$JRCode~noattrit$YearsAtCompany)
summary(results)
##
## Call:
## lm(formula = noattrit$JRCode ~ noattrit$YearsAtCompany)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.710 -1.954 0.756 1.945 5.121
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.87899 0.10601 46.025 <2e-16 ***
## noattrit$YearsAtCompany 0.02517 0.01152 2.186 0.029 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.305 on 1196 degrees of freedom
## (35 observations deleted due to missingness)
## Multiple R-squared: 0.003979, Adjusted R-squared: 0.003146
## F-statistic: 4.778 on 1 and 1196 DF, p-value: 0.02902
The Job Role “Sales Reps” has the lowest average duration at the company.
The Job Roles of Managers have the longest average duration at the company.
ggplot(data=noattrit,aes(x= JobRole,y=YearsWithCurrManager)) + geom_boxplot() + coord_flip() + ggtitle("Years With Current Manager Vary By Job Role \n (p-value = .0425)") + xlab("")+ylab("Number of Years")+theme(plot.title=element_text(hjust=0.5))
Sales Reps change managers most frequently, while directors and managers tend to stay with the same manager for a longer time.
DepartmentDF <- as.data.frame(table(noattrit$Department))
colnames(DepartmentDF) <- c("Department","DepartmentCount")
DepartmentDF$CoDepartmntPercentage <-paste(100*as.numeric(format(DepartmentDF$DepartmentCount/colSums(DepartmentDF[2]),digits=1)),"%")
DepartmentDF$CoDepartmntPercentage <-paste(100*as.numeric(format(DepartmentDF$DepartmentCount/colSums(DepartmentDF[2]),digits=1)),"%")
print.data.frame(DepartmentDF[with(DepartmentDF, order(-DepartmentCount)),],row.names=FALSE)
## Department DepartmentCount CoDepartmntPercentage
## Research & Development 828 67 %
## Sales 354 29 %
## Human Resources 51 4 %
ggplot(data=as.data.frame(table(noattrit$Department,dnn=list("Department"))), aes(x= reorder(Department,Freq),y=Freq)) +
geom_bar(stat="identity") + coord_flip() + ggtitle("Department Sizes \nno attrition") + xlab("") + ylab("")
options(width = 150)
ggplot(data=noattrit, aes(x=noattrit$Age,fill=noattrit$Department)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
cat("Summary of Age Distribution for the Company")
## Summary of Age Distribution for the Company
summary(noattrit$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 31.00 36.00 37.56 43.00 60.00
cat("Summary of Age Distributions by Department")
## Summary of Age Distributions by Department
aggdata <-aggregate(noattrit$Age, by=list(noattrit$Department),
FUN=summary, na.rm=TRUE)
print(aggdata)
## Group.1 x.Min. x.1st Qu. x.Median x.Mean x.3rd Qu. x.Max.
## 1 Human Resources 24.00 33.50 39.00 39.63 44.50 59.00
## 2 Research & Development 18.00 31.00 36.00 37.62 43.25 60.00
## 3 Sales 18.00 31.00 36.00 37.14 42.75 60.00
ggplot(data=as.data.frame(table(noattrit$Department,noattrit$Gender,dnn=list("Department","Gender"))), aes(x= reorder(Department,Freq),y=Freq, fill = Gender)) + geom_bar(stat="identity",position="fill") + coord_flip() + ggtitle("Gender by Department") + xlab("") + ylab("% Employees") + geom_hline(data=noattrit, aes(yintercept = .50), colour="black")
ggplot(data=noattrit, aes(x=noattrit$Gender,fill=Gender)) + geom_bar() + ggtitle("There are 46% more Males than Females in the Company \np-value=.04") + xlab("Gender")
#Recode Department to a numeric for t-test analysis
noattrit$DeptCode[noattrit$Department=="Research & Development"] <- 1L
noattrit$DeptCode[noattrit$Department=="Sales"] <- 2L
noattrit$DeptCode[noattrit$Department=="Human Resources"] <- 3L
t.test(table(noattrit$DeptCode, noattrit$Gender))
##
## One Sample t-test
##
## data: table(noattrit$DeptCode, noattrit$Gender)
## t = 2.7531, df = 5, p-value = 0.04016
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 13.62051 397.37949
## sample estimates:
## mean of x
## 205.5
ggplot(data=noattrit,aes(x=noattrit$JobLevel, y=noattrit$MonthlyIncome)) + geom_point() + ggtitle("Monthly Income is correlated to Job Level \n(p-value=<.0001") + ylab("Monthly Income") +xlab("Job Level") + geom_smooth(method = 'lm', se = FALSE)
results <- lm(noattrit$MonthlyIncome~noattrit$JobLevel)
summary(results)
##
## Call:
## lm(formula = noattrit$MonthlyIncome ~ noattrit$JobLevel)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5119.4 -1021.7 136.6 810.0 3763.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1953.62 92.83 -21.05 <2e-16 ***
## noattrit$JobLevel 4094.33 38.37 106.72 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1505 on 1231 degrees of freedom
## Multiple R-squared: 0.9025, Adjusted R-squared: 0.9024
## F-statistic: 1.139e+04 on 1 and 1231 DF, p-value: < 2.2e-16
ggplot(data=as.data.frame(table(noattrit$Department,noattrit$JobInvolvement,dnn=list("Department","JobInvolvement"))), aes(x= reorder(Department,Freq),y=Freq, fill = JobInvolvement)) + geom_bar(stat="identity",position="fill") + coord_flip() + ggtitle("4.4% of the Company has a Low Level of Job Involvement") + xlab("") + ylab("") + geom_hline(data=noattrit, aes(yintercept = .9554), colour="black")
t.test(table(noattrit$DeptCode, noattrit$JobInvolvement))
##
## One Sample t-test
##
## data: table(noattrit$DeptCode, noattrit$JobInvolvement)
## t = 2.5003, df = 11, p-value = 0.02949
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 12.29995 193.20005
## sample estimates:
## mean of x
## 102.75
ggplot(data=noattrit, aes(x=factor(noattrit$YearsWithCurrManager))) + geom_histogram(stat="count",position = "dodge") +xlab("Years") + scale_x_discrete(name='Years With Current Manager') + ggtitle("Years With Current Manager")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
cat("Percentage of Employees with No College-Level Education:",paste(100*as.numeric(format(nrow(as.data.frame(noattrit[( noattrit$Education == 1),]))/1233,digits=1)),"%"))
## Percentage of Employees with No College-Level Education: 10 %
ggplot(data=noattrit, aes(x=factor(noattrit$Education))) + geom_histogram(stat="count",position = "dodge") +xlab(" ") + scale_x_discrete(name='Education') + ggtitle("Education Level")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
ggplot(data=noattrit,aes(x= JobLevel,y=noattrit$TotalWorkingYears, group=JobLevel)) + geom_boxplot() + coord_flip() + ggtitle("Work Experience Correlates to Job Level") + xlab("Job Level")+ylab("Years")
#####Working Years vs Monthly Income
ggplot(data=noattrit,aes(x=noattrit$TotalWorkingYears,
y=noattrit$MonthlyIncome,col=factor(noattrit$JobLevel))) +
geom_point() + ggtitle("Total Working Years is Correlated to Job Level \np-value <.00001") + xlab("")+ylab("")
results <- lm(noattrit$JobLevel~noattrit$TotalWorkingYears)
summary(results)
##
## Call:
## lm(formula = noattrit$JobLevel ~ noattrit$TotalWorkingYears)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.06151 -0.48633 0.06362 0.40116 2.17613
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.811251 0.036340 22.32 <2e-16 ***
## noattrit$TotalWorkingYears 0.112513 0.002564 43.89 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6984 on 1231 degrees of freedom
## Multiple R-squared: 0.6101, Adjusted R-squared: 0.6098
## F-statistic: 1926 on 1 and 1231 DF, p-value: < 2.2e-16
ggplot(data=noattrit,aes(x=noattrit$YearsAtCompany, y=noattrit$MonthlyIncome,col=factor(noattrit$JobLevel))) +
geom_point() + ggtitle("Years At Company vs Monthly Income by JobLevel") + xlab("")+ylab("")
results <- lm(noattrit$MonthlyIncome ~ noattrit$JobLevel + noattrit$TotalWorkingYears + noattrit$YearsAtCompany)
summary(results)
##
## Call:
## lm(formula = noattrit$MonthlyIncome ~ noattrit$JobLevel + noattrit$TotalWorkingYears +
## noattrit$YearsAtCompany)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5295.9 -1004.0 90.5 849.6 3964.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1943.842 92.416 -21.034 < 2e-16 ***
## noattrit$JobLevel 3848.637 60.956 63.138 < 2e-16 ***
## noattrit$TotalWorkingYears 53.394 9.388 5.688 1.61e-08 ***
## noattrit$YearsAtCompany -15.735 8.680 -1.813 0.0701 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1487 on 1229 degrees of freedom
## Multiple R-squared: 0.905, Adjusted R-squared: 0.9047
## F-statistic: 3901 on 3 and 1229 DF, p-value: < 2.2e-16
ggplot(data=noattrit, aes(x=factor(noattrit$PerformanceRating))) + geom_histogram(stat="count",position = "dodge") +xlab(" ") + scale_x_discrete(name='Education') + ggtitle("Everyone Receives a Performance Rating of Excellent or Outstanding")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
ggplot(data=noattrit,aes(x= PerformanceRating, y=noattrit$MonthlyIncome, group=PerformanceRating)) + geom_boxplot() + ggtitle("Monthly Income by Performance Rating") + xlab("Performance Rating")+ylab("Monthly Income")
ggplot(data=noattrit,aes(x=noattrit$YearsWithCurrManager, y=noattrit$YearsSinceLastPromotion)) +
geom_point() + ggtitle("The Longer You Stay with a Manager the Longer Time Between Promotions") + xlab("Years With Current Manager")+ylab("Years Since Last Promotion") + stat_smooth(method="lm", se=FALSE)
results <- lm(noattrit$YearsSinceLastPromotion ~ noattrit$YearsWithCurrManager)
summary(results)
##
## Call:
## lm(formula = noattrit$YearsSinceLastPromotion ~ noattrit$YearsWithCurrManager)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.8662 -1.6387 -0.3318 0.7969 14.6682
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.33183 0.12694 2.614 0.00906 **
## noattrit$YearsWithCurrManager 0.43563 0.02245 19.407 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.832 on 1231 degrees of freedom
## Multiple R-squared: 0.2343, Adjusted R-squared: 0.2337
## F-statistic: 376.6 on 1 and 1231 DF, p-value: < 2.2e-16
ggplot(data=noattrit,aes(x= JobLevel,y=noattrit$YearsSinceLastPromotion, group=JobLevel)) + geom_boxplot() + coord_flip() + ggtitle("Time Since Last Promotion Increases as Job Level Increases") + xlab("Job Level")+ylab("Years")
ggplot(data=noattrit,aes(x= JobLevel,y=noattrit$PercentSalaryHike, group=JobLevel)) + geom_boxplot() + coord_flip() + ggtitle("All Job Levels Received About 15% Average Salary Increase \nwith roughly similar distributions") + xlab("Job Level")+ylab("% Salary Increase")
ggplot(data=noattrit,aes(x= JobSatisfaction,y=noattrit$HourlyRate, group=JobSatisfaction)) + geom_boxplot() + coord_flip() + ggtitle("Job Satisfaction Decreases as Hourly Rate Decreases \np-value=.05") + xlab("Job Satisfaction")+ylab("Hourly Rate")
ggplot(data=as.data.frame(table(noattrit$Education,noattrit$JobSatisfaction,dnn=list("Education","JobSatisfaction"))), aes(x=Education,y=Freq, fill = JobSatisfaction)) + geom_bar(stat="identity",position="fill") + ggtitle("PhDs Have the Highest Job Satisfaction \n People with Bachelor Degrees Have the Lowest Job Satisfaction \np-value=.01263") + xlab("Education Level") + ylab("") + geom_abline(slope=0, intercept=0.5, col = "black",lty=2) + coord_flip()
ggplot(data=as.data.frame(table(noattrit$Education,noattrit$JSCode,dnn=list("Education","JobSatisfaction"))), aes(x=Education,y=Freq, fill = JobSatisfaction)) + geom_bar(stat="identity",position="fill") + ggtitle("Job Satisfaction is Varies by Education") + xlab("") + ylab("") + geom_abline(slope=0, intercept=0.9, col = "black",lty=2) + coord_flip()
t.test(table(noattrit$JSCodeNum, noattrit$Education))
##
## One Sample t-test
##
## data: table(noattrit$JSCodeNum, noattrit$Education)
## t = 3.1046, df = 9, p-value = 0.01263
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 33.45881 213.14119
## sample estimates:
## mean of x
## 123.3
ggplot(data=as.data.frame(table(noattrit$Gender,noattrit$JobSatisfaction,dnn=list("Gender","JobSatisfaction"))), aes(x=Gender,y=Freq, fill = JobSatisfaction)) +
geom_bar(stat="identity",position="fill") + ggtitle("Job Satisfaction is Consistent Across Genders") + xlab("") + ylab("") + geom_abline(slope=0, intercept=0.5, col = "black",lty=2)
ggplot(data=as.data.frame(table(noattrit$JobRole,noattrit$JobSatisfaction,dnn=list("JobRole","JobSatisfaction"))), aes(x=JobRole,y=Freq, fill = JobSatisfaction)) + geom_bar(stat="identity",position="fill") + ggtitle("Job Satisfaction is Varies by Job Roles \nTop Dissatisfied Roles:R&D Director \nMost Satsifed: Sales Reps and HR Manager") + xlab("") + ylab("") + geom_abline(slope=0, intercept=0.9, col = "black",lty=2) + coord_flip()
t.test(table(noattrit$JSCodeNum, noattrit$JobRole))
##
## One Sample t-test
##
## data: table(noattrit$JSCodeNum, noattrit$JobRole)
## t = 4.1555, df = 21, p-value = 0.000448
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 27.99778 84.09313
## sample estimates:
## mean of x
## 56.04545
summary(noattrit$NumCompaniesWorked)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 2.000 2.646 4.000 9.000
ggplot(data=noattrit, aes(x=factor(noattrit$NumCompaniesWorked))) + geom_histogram(stat="count",position = "dodge") +xlab("Number CompaniesWorked") + scale_x_discrete(name='Number of Companies') +ggtitle("Number of Companies Worked")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
ggplot(data=noattrit,aes(x=noattrit$MonthlyRate, y=noattrit$MonthlyIncome)) +
geom_point() + ggtitle(" No Correlation between Monthly Income vs Monthly Rate \n p-value=.261") + xlab("Monthly Rate")+ylab("Monthly Income") + stat_smooth(method="lm", se=FALSE)
result <- lm(noattrit$MonthlyIncome ~ noattrit$MonthlyRate)
summary(result)
##
## Call:
## lm(formula = noattrit$MonthlyIncome ~ noattrit$MonthlyRate)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5979 -3597 -1645 1986 13353
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.522e+03 3.079e+02 21.180 <2e-16 ***
## noattrit$MonthlyRate 2.175e-02 1.933e-02 1.126 0.261
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4818 on 1231 degrees of freedom
## Multiple R-squared: 0.001028, Adjusted R-squared: 0.0002168
## F-statistic: 1.267 on 1 and 1231 DF, p-value: 0.2605
ggplot(data=noattrit,aes(x=noattrit$DailyRate, y=noattrit$MonthlyIncome)) +
geom_point() + ggtitle(" No Correlation between Monthly Income vs Daily Rate") + xlab("Daily Rate")+ylab("Monthly Income") + stat_smooth(method="lm", se=FALSE)
ggplot(data=noattrit,aes(x=noattrit$DailyRate, y=noattrit$MonthlyRate)) +
geom_point() + ggtitle(" No Correlation between Monthly Rate & Daily Rate") + xlab("Daily Rate")+ylab("Monthly Rate") + stat_smooth(method="lm", se=FALSE)
ggplot(data=noattrit,aes(x=noattrit$HourlyRate, y=noattrit$DailyRate)) +
geom_point() + ggtitle(" No Correlation between Hourly Rate & Daily Rate") + xlab("Hourly Rate")+ylab("Daily Rate") + stat_smooth(method="lm", se=FALSE)
ggplot(data=noattrit,aes(x=noattrit$YearsAtCompany, y=noattrit$DailyRate,col=factor(noattrit$JobLevel))) +
geom_point() + ggtitle("Daily Rate vs Years At Company by JobLevel") + xlab("Years")+ylab("Daily Rate")
1.Employee Number - a recoded start date?….No 2.Linear Regression Modeling to id first order variables to differentiate factor variables
options(width = 150)
#Run a linear regression model using all variables to id the important first order terms
jr.lm <- lm(noattrit$JRCode ~ noattrit$BusinessTravel+ noattrit$EducationField+ noattrit$Gender+ noattrit$OverTime+ noattrit$MaritalStatus+ noattrit$Age+ noattrit$DailyRate+
noattrit$DistanceFromHome+ noattrit$Education+ noattrit$EmployeeNumber+noattrit$EnvironmentSatisfaction+ noattrit$HourlyRate+ noattrit$JobInvolvement+ noattrit$JobLevel+ noattrit$JobSatisfaction+
noattrit$MonthlyIncome+ noattrit$MonthlyRate+ noattrit$NumCompaniesWorked+ noattrit$PercentSalaryHike+ noattrit$PerformanceRating+ noattrit$RelationshipSatisfaction+ noattrit$StockOptionLevel+
noattrit$TotalWorkingYears+ noattrit$TrainingTimesLastYear+ noattrit$WorkLifeBalance+ noattrit$YearsAtCompany+ noattrit$YearsInCurrentRole+ noattrit$YearsSinceLastPromotion+ noattrit$YearsWithCurrManager)
summary(jr.lm)
##
## Call:
## lm(formula = noattrit$JRCode ~ noattrit$BusinessTravel + noattrit$EducationField +
## noattrit$Gender + noattrit$OverTime + noattrit$MaritalStatus +
## noattrit$Age + noattrit$DailyRate + noattrit$DistanceFromHome +
## noattrit$Education + noattrit$EmployeeNumber + noattrit$EnvironmentSatisfaction +
## noattrit$HourlyRate + noattrit$JobInvolvement + noattrit$JobLevel +
## noattrit$JobSatisfaction + noattrit$MonthlyIncome + noattrit$MonthlyRate +
## noattrit$NumCompaniesWorked + noattrit$PercentSalaryHike +
## noattrit$PerformanceRating + noattrit$RelationshipSatisfaction +
## noattrit$StockOptionLevel + noattrit$TotalWorkingYears +
## noattrit$TrainingTimesLastYear + noattrit$WorkLifeBalance +
## noattrit$YearsAtCompany + noattrit$YearsInCurrentRole + noattrit$YearsSinceLastPromotion +
## noattrit$YearsWithCurrManager)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.5355 -1.6075 -0.1949 1.8112 4.7772
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.410e+00 1.087e+00 4.056 5.33e-05 ***
## noattrit$BusinessTravelTravel_Frequently -1.847e-01 2.466e-01 -0.749 0.45416
## noattrit$BusinessTravelTravel_Rarely -9.590e-02 2.067e-01 -0.464 0.64281
## noattrit$EducationFieldLife Sciences 1.170e+00 5.093e-01 2.297 0.02180 *
## noattrit$EducationFieldMarketing 2.600e+00 5.437e-01 4.782 1.95e-06 ***
## noattrit$EducationFieldMedical 1.083e+00 5.123e-01 2.114 0.03469 *
## noattrit$EducationFieldOther 1.002e+00 5.646e-01 1.774 0.07633 .
## noattrit$EducationFieldTechnical Degree 1.435e+00 5.486e-01 2.616 0.00901 **
## noattrit$GenderMale -2.214e-01 1.319e-01 -1.678 0.09361 .
## noattrit$OverTimeYes -4.345e-02 1.551e-01 -0.280 0.77937
## noattrit$MaritalStatusMarried 1.003e-01 1.698e-01 0.591 0.55479
## noattrit$MaritalStatusSingle 2.164e-01 2.398e-01 0.902 0.36701
## noattrit$Age -1.781e-02 1.024e-02 -1.739 0.08234 .
## noattrit$DailyRate -2.736e-05 1.609e-04 -0.170 0.86496
## noattrit$DistanceFromHome -4.919e-03 8.074e-03 -0.609 0.54250
## noattrit$Education -5.459e-03 6.475e-02 -0.084 0.93283
## noattrit$EmployeeNumber -1.061e-04 1.064e-04 -0.998 0.31872
## noattrit$EnvironmentSatisfaction -1.516e-02 6.083e-02 -0.249 0.80328
## noattrit$HourlyRate -2.095e-03 3.171e-03 -0.661 0.50902
## noattrit$JobInvolvement 2.160e-01 9.312e-02 2.320 0.02051 *
## noattrit$JobLevel -1.073e+00 1.932e-01 -5.555 3.44e-08 ***
## noattrit$JobSatisfaction 4.070e-02 5.940e-02 0.685 0.49335
## noattrit$MonthlyIncome 3.457e-04 4.427e-05 7.810 1.27e-14 ***
## noattrit$MonthlyRate 3.592e-06 9.061e-06 0.396 0.69182
## noattrit$NumCompaniesWorked -6.608e-02 2.965e-02 -2.229 0.02603 *
## noattrit$PercentSalaryHike -5.725e-03 2.766e-02 -0.207 0.83609
## noattrit$PerformanceRating 8.678e-02 2.784e-01 0.312 0.75531
## noattrit$RelationshipSatisfaction -3.219e-03 6.047e-02 -0.053 0.95755
## noattrit$StockOptionLevel 3.163e-02 1.037e-01 0.305 0.76049
## noattrit$TotalWorkingYears 1.107e-03 1.807e-02 0.061 0.95118
## noattrit$TrainingTimesLastYear -1.385e-02 5.001e-02 -0.277 0.78181
## noattrit$WorkLifeBalance -4.880e-02 9.561e-02 -0.510 0.60989
## noattrit$YearsAtCompany 4.173e-02 2.366e-02 1.764 0.07807 .
## noattrit$YearsInCurrentRole -2.674e-02 2.927e-02 -0.913 0.36118
## noattrit$YearsSinceLastPromotion 1.450e-02 2.607e-02 0.556 0.57803
## noattrit$YearsWithCurrManager -7.326e-02 2.980e-02 -2.459 0.01409 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.206 on 1162 degrees of freedom
## (35 observations deleted due to missingness)
## Multiple R-squared: 0.114, Adjusted R-squared: 0.08734
## F-statistic: 4.273 on 35 and 1162 DF, p-value: 6.053e-15
#Check revised model fit
jr.lm <- lm(noattrit$JRCode ~noattrit$EducationField+ noattrit$Age + noattrit$JobLevel+ noattrit$MonthlyIncome+ noattrit$Gender+ noattrit$JobInvolvement+ noattrit$NumCompaniesWorked+ noattrit$YearsAtCompany+ noattrit$YearsWithCurrManager)
summary(jr.lm)
##
## Call:
## lm(formula = noattrit$JRCode ~ noattrit$EducationField + noattrit$Age +
## noattrit$JobLevel + noattrit$MonthlyIncome + noattrit$Gender +
## noattrit$JobInvolvement + noattrit$NumCompaniesWorked + noattrit$YearsAtCompany +
## noattrit$YearsWithCurrManager)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3740 -1.5687 -0.2193 1.8478 4.8356
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.276e+00 6.369e-01 6.714 2.93e-11 ***
## noattrit$EducationFieldLife Sciences 1.141e+00 5.016e-01 2.274 0.02313 *
## noattrit$EducationFieldMarketing 2.554e+00 5.357e-01 4.768 2.09e-06 ***
## noattrit$EducationFieldMedical 1.057e+00 5.037e-01 2.098 0.03610 *
## noattrit$EducationFieldOther 9.505e-01 5.568e-01 1.707 0.08809 .
## noattrit$EducationFieldTechnical Degree 1.389e+00 5.399e-01 2.573 0.01021 *
## noattrit$Age -1.773e-02 8.586e-03 -2.065 0.03915 *
## noattrit$JobLevel -1.073e+00 1.881e-01 -5.702 1.50e-08 ***
## noattrit$MonthlyIncome 3.465e-04 4.324e-05 8.015 2.63e-15 ***
## noattrit$GenderMale -2.246e-01 1.298e-01 -1.731 0.08378 .
## noattrit$JobInvolvement 2.057e-01 9.152e-02 2.247 0.02481 *
## noattrit$NumCompaniesWorked -6.591e-02 2.820e-02 -2.337 0.01958 *
## noattrit$YearsAtCompany 3.656e-02 1.897e-02 1.927 0.05418 .
## noattrit$YearsWithCurrManager -7.996e-02 2.815e-02 -2.840 0.00459 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.191 on 1184 degrees of freedom
## (35 observations deleted due to missingness)
## Multiple R-squared: 0.1094, Adjusted R-squared: 0.09957
## F-statistic: 11.18 on 13 and 1184 DF, p-value: < 2.2e-16
js.lm <- lm(noattrit$JobSatisfaction ~ noattrit$BusinessTravel+ noattrit$EducationField+ noattrit$Gender+ noattrit$OverTime+ noattrit$MaritalStatus+ noattrit$Age+ noattrit$DailyRate+ noattrit$DistanceFromHome+ noattrit$Education+ noattrit$EmployeeNumber+ noattrit$EnvironmentSatisfaction+ noattrit$HourlyRate+ noattrit$JobInvolvement+ noattrit$JobLevel+ noattrit$JobRole+ noattrit$MonthlyIncome+ noattrit$MonthlyRate+ noattrit$NumCompaniesWorked+ noattrit$PercentSalaryHike+ noattrit$PerformanceRating+ noattrit$RelationshipSatisfaction+ noattrit$StockOptionLevel+ noattrit$TotalWorkingYears+ noattrit$TrainingTimesLastYear+ noattrit$WorkLifeBalance+ noattrit$YearsAtCompany+ noattrit$YearsInCurrentRole+ noattrit$YearsSinceLastPromotion+ noattrit$YearsWithCurrManager)
summary(js.lm)
##
## Call:
## lm(formula = noattrit$JobSatisfaction ~ noattrit$BusinessTravel +
## noattrit$EducationField + noattrit$Gender + noattrit$OverTime +
## noattrit$MaritalStatus + noattrit$Age + noattrit$DailyRate +
## noattrit$DistanceFromHome + noattrit$Education + noattrit$EmployeeNumber +
## noattrit$EnvironmentSatisfaction + noattrit$HourlyRate +
## noattrit$JobInvolvement + noattrit$JobLevel + noattrit$JobRole +
## noattrit$MonthlyIncome + noattrit$MonthlyRate + noattrit$NumCompaniesWorked +
## noattrit$PercentSalaryHike + noattrit$PerformanceRating +
## noattrit$RelationshipSatisfaction + noattrit$StockOptionLevel +
## noattrit$TotalWorkingYears + noattrit$TrainingTimesLastYear +
## noattrit$WorkLifeBalance + noattrit$YearsAtCompany + noattrit$YearsInCurrentRole +
## noattrit$YearsSinceLastPromotion + noattrit$YearsWithCurrManager)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3281 -0.8023 0.1658 1.0144 1.7461
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.388e+00 5.759e-01 5.884 5.2e-09 ***
## noattrit$BusinessTravelTravel_Frequently 3.423e-05 1.213e-01 0.000 0.9998
## noattrit$BusinessTravelTravel_Rarely -1.124e-01 1.015e-01 -1.107 0.2686
## noattrit$EducationFieldLife Sciences -9.068e-02 3.226e-01 -0.281 0.7787
## noattrit$EducationFieldMarketing -3.070e-01 3.443e-01 -0.892 0.3727
## noattrit$EducationFieldMedical -2.194e-01 3.244e-01 -0.676 0.4991
## noattrit$EducationFieldOther -2.565e-01 3.439e-01 -0.746 0.4559
## noattrit$EducationFieldTechnical Degree -1.939e-01 3.393e-01 -0.571 0.5678
## noattrit$GenderMale 8.196e-02 6.457e-02 1.269 0.2046
## noattrit$OverTimeYes 1.514e-01 7.572e-02 2.000 0.0457 *
## noattrit$MaritalStatusMarried 6.284e-02 8.295e-02 0.758 0.4488
## noattrit$MaritalStatusSingle 2.900e-01 1.170e-01 2.479 0.0133 *
## noattrit$Age 5.861e-04 5.039e-03 0.116 0.9074
## noattrit$DailyRate 7.588e-05 7.867e-05 0.965 0.3350
## noattrit$DistanceFromHome 2.969e-03 3.932e-03 0.755 0.4503
## noattrit$Education 6.693e-03 3.164e-02 0.211 0.8325
## noattrit$EmployeeNumber -6.788e-05 5.215e-05 -1.302 0.1933
## noattrit$EnvironmentSatisfaction -3.693e-02 2.976e-02 -1.241 0.2150
## noattrit$HourlyRate -3.215e-03 1.548e-03 -2.076 0.0381 *
## noattrit$JobInvolvement -5.204e-02 4.582e-02 -1.136 0.2563
## noattrit$JobLevel 2.744e-02 1.039e-01 0.264 0.7918
## noattrit$JobRoleHuman Resources -1.049e-01 2.361e-01 -0.444 0.6569
## noattrit$JobRoleHuman Resources Manager -7.513e-02 4.247e-01 -0.177 0.8596
## noattrit$JobRoleLaboratory Technician 1.770e-02 1.447e-01 0.122 0.9027
## noattrit$JobRoleManufacturing Director -9.222e-02 1.380e-01 -0.668 0.5041
## noattrit$JobRoleResearch & Development Manager -4.119e-02 2.440e-01 -0.169 0.8660
## noattrit$JobRoleResearch Director -7.286e-02 2.153e-01 -0.338 0.7351
## noattrit$JobRoleResearch Scientist 5.290e-02 1.407e-01 0.376 0.7070
## noattrit$JobRoleSales Executive 7.784e-02 1.301e-01 0.598 0.5497
## noattrit$JobRoleSales Manager 4.968e-02 2.636e-01 0.188 0.8506
## noattrit$JobRoleSales Representative 8.984e-02 2.031e-01 0.442 0.6583
## noattrit$MonthlyIncome 4.124e-06 2.776e-05 0.149 0.8819
## noattrit$MonthlyRate -5.035e-06 4.429e-06 -1.137 0.2558
## noattrit$NumCompaniesWorked -2.026e-02 1.441e-02 -1.406 0.1600
## noattrit$PercentSalaryHike 1.693e-02 1.361e-02 1.244 0.2137
## noattrit$PerformanceRating -3.150e-02 1.368e-01 -0.230 0.8180
## noattrit$RelationshipSatisfaction -4.988e-02 2.961e-02 -1.685 0.0923 .
## noattrit$StockOptionLevel 5.927e-02 5.093e-02 1.164 0.2447
## noattrit$TotalWorkingYears -3.949e-03 8.865e-03 -0.446 0.6560
## noattrit$TrainingTimesLastYear -5.648e-03 2.459e-02 -0.230 0.8184
## noattrit$WorkLifeBalance -2.412e-02 4.651e-02 -0.519 0.6041
## noattrit$YearsAtCompany -8.128e-05 1.095e-02 -0.007 0.9941
## noattrit$YearsInCurrentRole 2.683e-03 1.387e-02 0.193 0.8466
## noattrit$YearsSinceLastPromotion -7.644e-03 1.242e-02 -0.615 0.5384
## noattrit$YearsWithCurrManager -1.587e-02 1.442e-02 -1.101 0.2712
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.091 on 1188 degrees of freedom
## Multiple R-squared: 0.0395, Adjusted R-squared: 0.003926
## F-statistic: 1.11 on 44 and 1188 DF, p-value: 0.2889
js.lm <- lm(noattrit$JobSatisfaction ~noattrit$OverTime+noattrit$MaritalStatus+noattrit$HourlyRate + noattrit$RelationshipSatisfaction)
summary(js.lm)
##
## Call:
## lm(formula = noattrit$JobSatisfaction ~ noattrit$OverTime + noattrit$MaritalStatus +
## noattrit$HourlyRate + noattrit$RelationshipSatisfaction)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1355 -0.8005 0.1982 1.1236 1.4946
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.018010 0.143969 20.963 <2e-16 ***
## noattrit$OverTimeYes 0.132104 0.073547 1.796 0.0727 .
## noattrit$MaritalStatusMarried 0.037520 0.077789 0.482 0.6297
## noattrit$MaritalStatusSingle 0.191990 0.086500 2.220 0.0266 *
## noattrit$HourlyRate -0.003140 0.001524 -2.060 0.0396 *
## noattrit$RelationshipSatisfaction -0.049647 0.029088 -1.707 0.0881 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.089 on 1227 degrees of freedom
## Multiple R-squared: 0.01252, Adjusted R-squared: 0.008492
## F-statistic: 3.11 on 5 and 1227 DF, p-value: 0.00852
#####Note: The fit is poor, but may provide directionality as to first order factors to assess.
ggplot(data=as.data.frame(table(attrit$JobSatisfaction,attrit$MaritalStatus,dnn=list("JobSatisfaction","MaritalStatus"))), aes(x= reorder(JobSatisfaction,Freq),y=Freq, fill = MaritalStatus)) +
geom_bar(stat="identity",position="fill") + coord_flip() + ggtitle("Job Satisfaction by Marital Status is Consistent") + xlab("") + ylab("") + geom_abline(slope=0, intercept=0.5, col = "black",lty=2)
ggplot(data=as.data.frame(table(attrit$JobSatisfaction,attrit$RelationshipSatisfaction,dnn=list("JobSatisfaction","RelationshipSatisfaction"))), aes(x= reorder(JobSatisfaction,Freq),y=Freq, fill = factor(RelationshipSatisfaction))) +
geom_bar(stat="identity",position="fill") + coord_flip() + ggtitle("Job Satisfaction by Relationship Satisfaction is Consistent") + xlab("") + ylab("") + geom_abline(slope=0, intercept=0.5, col = "black",lty=2)
JS.4 <- noattrit[noattrit$JobSatisfaction==4,]
JS.3 <- noattrit[noattrit$JobSatisfaction==3,]
highjs <- rbind(JS.3, JS.4)
demohighjs<- highjs[1:8]
demohighjs$Attrition <- NULL
demohighjs$BusinessTravel <- NULL
demohighjs$OverTime <- NULL
demohighjs$MaritalStatus <- NULL
countdemohighjs<- count(demohighjs)
head(countdemohighjs[order(countdemohighjs$freq,decreasing = TRUE),],10)
## Department EducationField Gender JobRole freq
## 21 Research & Development Life Sciences Male Research Scientist 53
## 60 Sales Life Sciences Male Sales Executive 39
## 17 Research & Development Life Sciences Male Laboratory Technician 38
## 33 Research & Development Medical Male Research Scientist 35
## 16 Research & Development Life Sciences Male Healthcare Representative 31
## 66 Sales Marketing Male Sales Executive 30
## 29 Research & Development Medical Male Laboratory Technician 29
## 15 Research & Development Life Sciences Female Research Scientist 28
## 57 Sales Life Sciences Female Sales Executive 25
## 63 Sales Marketing Female Sales Executive 24
*(comment: Joblevel description was missing from the original dataset, so we only have the category number).